import requests
import sys
import socket
from lxml import etree
from DOING.dao_报录比 import DAO_报录比
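'''
readme:

The 2014, 2015 and 2016 pages share the same layout.
    Row selector to use:
        ones = element.xpath('//table[2]//tr')
The 2017 page is laid out differently from the other three years.
    Row selector to use:
        ones = element.xpath('//table//tr')


Besides choosing the matching row selector above, two places must be changed
for each year that is crawled:
    1.   res_year = 2017
    2.   response = requests.get(url_2017, headers=headers)
'''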
class test_报录比:
    """Helper that reads integer cells out of a scraped table row."""

    def tansfer(self, path, node=None):
        """Return the first XPath match on a row as an int, or 0 if unusable.

        Args:
            path: XPath expression evaluated against the row, for example
                ``'.//td[3]/text()'``.
            node: Object exposing ``xpath(path)`` (e.g. an lxml element).
                When omitted, the module-level variable ``one`` is used,
                which is how the existing ``__main__`` loop hands over the
                current row.

        Returns:
            The first match converted with ``int()``; ``0`` when the
            expression matches nothing or the matched text is not an integer.
        """
        if node is None:
            # Backward compatibility: legacy callers pass only ``path`` and
            # rely on the global loop variable ``one`` being the current row.
            node = one  # noqa: F821

        matches = node.xpath(path)
        try:
            # Indexing is inside the try so an empty cell yields 0 instead of
            # an IndexError, matching the fallback for non-numeric text.
            return int(matches[0])
        except (IndexError, ValueError, TypeError):
            return 0

if __name__ == '__main__':
    # Download one year's table from kaoyan.eol.cn, read the major name and
    # the two numeric columns of every data row, and store each row through
    # DAO_报录比.
    dao = DAO_报录比()
    tansfer = test_报录比()
    url_2014 = 'http://kaoyan.eol.cn/tiao_ji/baolubi/201509/t20150901_1310585.shtml'
    url_2015 = 'http://kaoyan.eol.cn/tiao_ji/baolubi/201509/t20150901_1310587.shtml'
    url_2016 = 'http://kaoyan.eol.cn/tiao_ji/baolubi/201707/t20170726_1543659.shtml'
    url_2017 = 'http://kaoyan.eol.cn/tiao_ji/baolubi/201807/t20180730_1619329.shtml'
    # Keep res_year and the URL passed to requests.get() below in sync.
    res_year = 2014
    # NOTE(review): both socket and requests take timeouts in seconds, so
    # 5000 is roughly 83 minutes -- confirm this was not meant as 5 seconds.
    socket.setdefaulttimeout(5000)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"}
    response = requests.get(url_2014, headers=headers, timeout=5000)
    # Stop on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()
    # The page is UTF-8. Decode the raw bytes directly rather than
    # round-tripping response.text through ISO-8859-1, which raises
    # UnicodeEncodeError whenever requests picked any other encoding.
    string = response.content.decode('utf-8')
    element = etree.HTML(string)
    ones = element.xpath('//table[2]//tr')
    # The first row is skipped (presumably the table header); numbering of
    # the printed rows starts at 1.
    # The loop variable must stay a module-level name called `one`:
    # test_报录比.tansfer reads the current row from that global.
    for i, one in enumerate(ones[1:], start=1):
        maj_name = str(one.xpath('.//td[2]/text()')[0])
        res_register_num = tansfer.tansfer('.//td[3]/text()')
        res_enroll_num = tansfer.tansfer('.//td[4]/text()')
        print(i, maj_name, res_register_num, res_enroll_num)
        # NOTE(review): the meaning of the -1 and '' arguments is defined by
        # DAO_报录比.add_报录比, which is not visible here.
        dao.add_报录比(res_year, res_register_num, res_enroll_num, -1, '北京师范大学', '', maj_name)

